0. Read data
# Work from the project folder so the relative CSV names below resolve.
setwd("/Users/mandyhong/Desktop/DA401")

# Three hate-crime tables, each with the columns year, month and hate_crime:
# all racial bias, anti-Black, and anti-Asian.
allHC <- read.csv(file = "all_race_hc_ts.csv")
blackHC <- read.csv(file = "black_hc_ts.csv")
asianHC <- read.csv(file = "asian_hc_ts.csv")

# Quick sanity check of the all-race table.
summary(allHC)
## year month hate_crime
## Min. :1991 Min. : 1.00 Min. : 198.0
## 1st Qu.:1998 1st Qu.: 3.75 1st Qu.: 319.0
## Median :2006 Median : 6.50 Median : 379.0
## Mean :2006 Mean : 6.50 Mean : 389.1
## 3rd Qu.:2013 3rd Qu.: 9.25 3rd Qu.: 448.0
## Max. :2020 Max. :12.00 Max. :1329.0
# Per-column summary statistics of the anti-Black table (year, month, hate_crime).
summary(blackHC)
## year month hate_crime
## Min. :1991 Min. : 1.00 Min. : 88.0
## 1st Qu.:1998 1st Qu.: 3.75 1st Qu.:167.0
## Median :2006 Median : 6.50 Median :202.0
## Mean :2006 Mean : 6.50 Mean :207.7
## 3rd Qu.:2013 3rd Qu.: 9.25 3rd Qu.:242.5
## Max. :2020 Max. :12.00 Max. :693.0
# Per-column summary statistics of the anti-Asian table (year, month, hate_crime).
summary(asianHC)
## year month hate_crime
## Min. :1991 Min. : 1.00 Min. : 3.00
## 1st Qu.:1998 1st Qu.: 3.75 1st Qu.:12.00
## Median :2006 Median : 6.50 Median :17.00
## Mean :2006 Mean : 6.50 Mean :17.84
## 3rd Qu.:2013 3rd Qu.: 9.25 3rd Qu.:23.00
## Max. :2020 Max. :12.00 Max. :51.00
# Census table joined with hate-crime figures; read relative to the working
# directory. The summary lists every column available for modelling.
DF <- read.csv(file = "census_w_hc_five_agg.csv")
summary(object = DF)
## YEAR MEDIAN_INCOME MEAN_INCOME POP
## Min. :2018 Min. :43539 Min. : 60672 Min. : 582087
## 1st Qu.:2018 1st Qu.:53334 1st Qu.: 72177 1st Qu.: 1846425
## Median :2018 Median :58671 Median : 77483 Median : 4548930
## Mean :2018 Mean :60177 Mean : 81477 Mean : 6441289
## 3rd Qu.:2018 3rd Qu.:67724 3rd Qu.: 89860 3rd Qu.: 7203582
## Max. :2018 Max. :81744 Max. :110050 Max. :39083067
## ONE_RACE_POP WHITE BLACK ASIAN
## Min. : 565154 Min. : 352867 Min. : 4910 Min. : 5032
## 1st Qu.: 1807184 1st Qu.: 1565282 1st Qu.: 66353 1st Qu.: 38463
## Median : 4449047 Median : 3321945 Median : 347392 Median : 115148
## Mean : 6210087 Mean : 4667506 Mean : 810521 Mean : 350355
## 3rd Qu.: 6806593 3rd Qu.: 5535156 3rd Qu.:1363354 3rd Qu.: 355764
## Max. :36973311 Max. :23264767 Max. :3353300 Max. :5597871
## FOREIGN_BORN_NON_US POVERTY_PERCENT UNEMPLOYMENT_RATE SCHOOL_ENROLLMENT
## Min. : 10194 Min. : 7.90 Min. :2.840 Min. : 146042
## 1st Qu.: 69961 1st Qu.:11.13 1st Qu.:4.770 1st Qu.: 464009
## Median : 153082 Median :13.59 Median :5.910 Median : 1112828
## Mean : 441622 Mean :13.54 Mean :5.716 Mean : 1624249
## 3rd Qu.: 431093 3rd Qu.:15.41 3rd Qu.:6.660 3rd Qu.: 1753807
## Max. :5151904 Max. :20.90 Max. :8.240 Max. :10458700
## X18_24_BACHELOR_HIGHER X25_34_BACHELOR_HIGHER X35_44_BACHELOR_HIGHER
## Min. : 3840 Min. : 21763 Min. : 21045
## 1st Qu.: 14126 1st Qu.: 65052 1st Qu.: 65662
## Median : 35836 Median : 183529 Median : 177531
## Mean : 66872 Mean : 310356 Mean : 289945
## 3rd Qu.: 87784 3rd Qu.: 405686 3rd Qu.: 387764
## Max. :398819 Max. :2092631 Max. :1870672
## X45_64_BACHELOR_HIGHER X65_MORE_BACHELOR_HIGHER WHITE_PER_CAPITA
## Min. : 39291 Min. : 23273 Min. :24.85
## 1st Qu.: 140207 1st Qu.: 74472 1st Qu.:68.30
## Median : 325318 Median : 178468 Median :77.89
## Mean : 510497 Mean : 262811 Mean :76.45
## 3rd Qu.: 725982 3rd Qu.: 346725 3rd Qu.:85.22
## Max. :3113981 Max. :1640027 Max. :94.38
## BLACK_PER_CAPITA ASIAN_PER_CAPITA ONE_RACE_RATIO FOREIGN_BORN_NON_US_RATIO
## Min. : 0.4708 Min. : 0.7638 Min. :76.03 Min. : 0.796
## 1st Qu.: 3.3315 1st Qu.: 1.5428 1st Qu.:96.50 1st Qu.: 2.498
## Median : 7.3649 Median : 2.7109 Median :96.99 Median : 3.928
## Mean :10.5345 Mean : 4.1741 Mean :96.29 Mean : 4.786
## 3rd Qu.:15.0909 3rd Qu.: 4.6028 3rd Qu.:97.43 3rd Qu.: 6.780
## Max. :37.6340 Max. :37.8426 Max. :98.63 Max. :13.184
## SCHOOL_ENROLLMENT_RATE ALL_HATE_CRIME BLACK_HATE_CRIME ASIAN_HATE_CRIME
## Min. :21.49 Min. : 6.00 Min. : 2.40 Min. : 0.20
## 1st Qu.:24.03 1st Qu.: 28.25 1st Qu.: 10.90 1st Qu.: 0.80
## Median :24.95 Median : 67.50 Median : 26.00 Median : 2.70
## Mean :24.89 Mean : 147.22 Mean : 54.83 Mean : 6.28
## 3rd Qu.:25.52 3rd Qu.: 163.75 3rd Qu.: 56.70 3rd Qu.: 7.30
## Max. :31.90 Max. :1086.60 Max. :454.40 Max. :46.40
## ALL_HC_PER_CAPITA BLACK_HC_PER_CAPITA_POP BLACK_HC_PER_CAPITA_RACE_POP
## Min. : 0.3255 Min. : 0.0951 Min. : 0.6937
## 1st Qu.: 1.3407 1st Qu.: 0.4776 1st Qu.: 17.5300
## Median : 2.4570 Median : 0.9124 Median : 47.4428
## Mean : 6.2577 Mean : 2.2650 Mean : 94.6701
## 3rd Qu.: 6.8685 3rd Qu.: 2.2786 3rd Qu.: 87.9626
## Max. :51.5159 Max. :19.8888 Max. :1732.7170
## ASIAN_HC_PER_CAPITA_POP ASIAN_HC_PER_CAPITA_RACE_POP BACHELOR_RATE
## Min. :0.002662 Min. : 0.0302 Min. :14.99
## 1st Qu.:0.024540 1st Qu.: 0.9113 1st Qu.:19.25
## Median :0.081968 Median : 3.9117 Median :21.18
## Mean :0.254118 Mean : 15.1502 Mean :21.80
## 3rd Qu.:0.241591 3rd Qu.: 11.7414 3rd Qu.:24.25
## Max. :3.001644 Max. :339.4914 Max. :31.67
# Keep only the columns used for modelling: ten census predictors followed by
# the three per-capita hate-crime outcomes.
model_columns <- c(
  "MEDIAN_INCOME", "POP",
  "WHITE_PER_CAPITA", "BLACK_PER_CAPITA", "ASIAN_PER_CAPITA",
  "FOREIGN_BORN_NON_US_RATIO", "POVERTY_PERCENT", "UNEMPLOYMENT_RATE",
  "SCHOOL_ENROLLMENT_RATE", "BACHELOR_RATE",
  "ALL_HC_PER_CAPITA", "BLACK_HC_PER_CAPITA_POP", "ASIAN_HC_PER_CAPITA_POP"
)
DF2 <- DF[model_columns]
summary(DF2)
## MEDIAN_INCOME POP WHITE_PER_CAPITA BLACK_PER_CAPITA
## Min. :43539 Min. : 582087 Min. :24.85 Min. : 0.4708
## 1st Qu.:53334 1st Qu.: 1846425 1st Qu.:68.30 1st Qu.: 3.3315
## Median :58671 Median : 4548930 Median :77.89 Median : 7.3649
## Mean :60177 Mean : 6441289 Mean :76.45 Mean :10.5345
## 3rd Qu.:67724 3rd Qu.: 7203582 3rd Qu.:85.22 3rd Qu.:15.0909
## Max. :81744 Max. :39083067 Max. :94.38 Max. :37.6340
## ASIAN_PER_CAPITA FOREIGN_BORN_NON_US_RATIO POVERTY_PERCENT UNEMPLOYMENT_RATE
## Min. : 0.7638 Min. : 0.796 Min. : 7.90 Min. :2.840
## 1st Qu.: 1.5428 1st Qu.: 2.498 1st Qu.:11.13 1st Qu.:4.770
## Median : 2.7109 Median : 3.928 Median :13.59 Median :5.910
## Mean : 4.1741 Mean : 4.786 Mean :13.54 Mean :5.716
## 3rd Qu.: 4.6028 3rd Qu.: 6.780 3rd Qu.:15.41 3rd Qu.:6.660
## Max. :37.8426 Max. :13.184 Max. :20.90 Max. :8.240
## SCHOOL_ENROLLMENT_RATE BACHELOR_RATE ALL_HC_PER_CAPITA
## Min. :21.49 Min. :14.99 Min. : 0.3255
## 1st Qu.:24.03 1st Qu.:19.25 1st Qu.: 1.3407
## Median :24.95 Median :21.18 Median : 2.4570
## Mean :24.89 Mean :21.80 Mean : 6.2577
## 3rd Qu.:25.52 3rd Qu.:24.25 3rd Qu.: 6.8685
## Max. :31.90 Max. :31.67 Max. :51.5159
## BLACK_HC_PER_CAPITA_POP ASIAN_HC_PER_CAPITA_POP
## Min. : 0.0951 Min. :0.002662
## 1st Qu.: 0.4776 1st Qu.:0.024540
## Median : 0.9124 Median :0.081968
## Mean : 2.2650 Mean :0.254118
## 3rd Qu.: 2.2786 3rd Qu.:0.241591
## Max. :19.8888 Max. :3.001644
# Number of missing cells in the modelling table.
sum(is.na(DF2))
## [1] 0
# Save the modelling table without row names.
write.csv(
  DF2,
  file = "/Users/mandyhong/Desktop/DA401/census_w_hc_FINAL.csv",
  row.names = FALSE
)
1. Descriptive Analysis
1.1. Time series plots for all three data sets
#1. tsplot
# One plot per series, in order: all racial, anti-Black, anti-Asian.
# No title or axis labels are passed, so each plot uses tsplot's defaults.
tsplot(allHC$hate_crime)

tsplot(blackHC$hate_crime)

tsplot(asianHC$hate_crime)

# Palette shared by the plots below: one colour per series
# (1 = all races, 2 = anti-Black, 3 = anti-Asian).
culer <- c(rgb(.85, .30, .12, .6), rgb(.12, .65, .85, .6), "aquamarine3")

# The y-axis runs from the smallest anti-Asian count to the largest all-race
# count so that all three series fit in one panel.
count_limits <- c(min(asianHC$hate_crime), max(allHC$hate_crime))
tsplot(
  allHC$hate_crime,
  col = culer[1], lwd = 2, pch = 20,
  ylim = count_limits,
  ylab = "Hate crimes", main = "Racial Hate Crimes"
)
lines(blackHC$hate_crime, col = culer[2], lwd = 2, pch = 20)
lines(asianHC$hate_crime, col = culer[3], lwd = 2, pch = 20)
legend(
  "topleft",
  col = culer, lty = 1, lwd = 2, pch = 20,
  legend = c("All race", "African American/Black", "Asian"),
  bg = "white"
)

# Same overlay on the log scale, which makes the much smaller anti-Asian
# series comparable with the other two.
culer <- c(rgb(.85, .30, .12, .6), rgb(.12, .65, .85, .6), "aquamarine3")
log_all <- log(allHC$hate_crime)
log_black <- log(blackHC$hate_crime)
log_asian <- log(asianHC$hate_crime)

# Upper limit is one log unit above the all-race maximum.
log_limits <- c(min(log_asian), max(log_all + 1))
tsplot(
  log_all,
  col = culer[1], lwd = 2, pch = 20,
  ylim = log_limits,
  ylab = "log(Hate crimes)", main = "Racial Hate Crimes"
)
lines(log_black, col = culer[2], lwd = 2, pch = 20)
lines(log_asian, col = culer[3], lwd = 2, pch = 20)
legend(
  "topleft",
  col = culer, lty = 1, lwd = 2, pch = 20,
  legend = c("All race", "African American/Black", "Asian"),
  bg = "white"
)

# 2. Time series plots with calendar dates on the x-axis.
# For every table two xts objects are built: one indexed by POSIXct
# first-of-month timestamps (ts1, ts3, ts5) and one indexed by year-month
# (ts2, ts4, ts6). Only the year-month versions are plotted.
all_month_start <- as.POSIXct(sprintf("%d-%d-01", allHC$year, allHC$month))
all_year_month <- as.yearmon(allHC$year + (allHC$month - 1) / 12)
ts1 <- xts(allHC$hate_crime, all_month_start)
ts2 <- xts(allHC$hate_crime, all_year_month)
plot(ts2, main = "All Racial Hate Crimes", col = culer[1])

black_month_start <- as.POSIXct(sprintf("%d-%d-01", blackHC$year, blackHC$month))
black_year_month <- as.yearmon(blackHC$year + (blackHC$month - 1) / 12)
ts3 <- xts(blackHC$hate_crime, black_month_start)
ts4 <- xts(blackHC$hate_crime, black_year_month)
plot(ts4, main = "Anti-African American or Black Hate Crimes", col = culer[2])

asian_month_start <- as.POSIXct(sprintf("%d-%d-01", asianHC$year, asianHC$month))
asian_year_month <- as.yearmon(asianHC$year + (asianHC$month - 1) / 12)
ts5 <- xts(asianHC$hate_crime, asian_month_start)
ts6 <- xts(asianHC$hate_crime, asian_year_month)
plot(ts6, main = "Anti-Asian Hate Crimes", col = culer[3])

1.2. Summary statistics and distribution
# Per-column summary statistics of the all-race table (year, month, hate_crime).
summary(allHC)
## year month hate_crime
## Min. :1991 Min. : 1.00 Min. : 198.0
## 1st Qu.:1998 1st Qu.: 3.75 1st Qu.: 319.0
## Median :2006 Median : 6.50 Median : 379.0
## Mean :2006 Mean : 6.50 Mean : 389.1
## 3rd Qu.:2013 3rd Qu.: 9.25 3rd Qu.: 448.0
## Max. :2020 Max. :12.00 Max. :1329.0
# Standard deviation of the all-race hate-crime counts.
sd(allHC$hate_crime)
## [1] 112.6868
# Per-column summary statistics of the anti-Black table.
summary(blackHC)
## year month hate_crime
## Min. :1991 Min. : 1.00 Min. : 88.0
## 1st Qu.:1998 1st Qu.: 3.75 1st Qu.:167.0
## Median :2006 Median : 6.50 Median :202.0
## Mean :2006 Mean : 6.50 Mean :207.7
## 3rd Qu.:2013 3rd Qu.: 9.25 3rd Qu.:242.5
## Max. :2020 Max. :12.00 Max. :693.0
# Standard deviation of the anti-Black hate-crime counts.
sd(blackHC$hate_crime)
## [1] 61.32818
# Per-column summary statistics of the anti-Asian table.
summary(asianHC)
## year month hate_crime
## Min. :1991 Min. : 1.00 Min. : 3.00
## 1st Qu.:1998 1st Qu.: 3.75 1st Qu.:12.00
## Median :2006 Median : 6.50 Median :17.00
## Mean :2006 Mean : 6.50 Mean :17.84
## 3rd Qu.:2013 3rd Qu.: 9.25 3rd Qu.:23.00
## Max. :2020 Max. :12.00 Max. :51.00
# Standard deviation of the anti-Asian hate-crime counts.
sd(asianHC$hate_crime)
## [1] 8.30969
# Distribution of the counts in each series, coloured with the shared palette.
hist(
  allHC$hate_crime,
  col = culer[1],
  main = "Distribution of All Racial Hate Crimes",
  xlab = "Number of Hate Crimes"
)

hist(
  blackHC$hate_crime,
  col = culer[2],
  main = "Distribution of Anti-Black Racial Hate Crimes",
  xlab = "Number of Hate Crimes"
)

hist(
  asianHC$hate_crime,
  col = culer[3],
  main = "Distribution of Anti-Asian Racial Hate Crimes",
  xlab = "Number of Hate Crimes"
)

1.3. Check and describe outliers
# Flag unusually large period-to-period changes in the all-race series using
# IQR fences: Q1 - 1.5 * IQR and Q3 + 1.5 * IQR.
# The first row has no predecessor; it is stored as 0 only so that the new
# column lines up with the rows of allHC.
allHC$difference <- c(0, diff(allHC$hate_crime))
# Quartiles and IQR are both taken from the real differences. The artificial
# leading 0 is left out so it cannot shift the quartiles.
all_changes <- diff(allHC$hate_crime)
iqr <- IQR(all_changes)
Q <- quantile(all_changes, probs = c(.25, .75), na.rm = FALSE)
high <- Q[2] + 1.5 * iqr
low <- Q[1] - 1.5 * iqr
tsplot(allHC$difference, main="Detecting Outliers Using IQR Score: All Racial Hate Crimes", ylab="Differenced(hate crime)")
# Dashed red horizontal lines mark the upper and lower fences.
abline(h = c(high, low), lty = 2, col = "red")

# Flag unusually large period-to-period changes in the anti-Black series using
# IQR fences: Q1 - 1.5 * IQR and Q3 + 1.5 * IQR.
# The first row has no predecessor; it is stored as 0 only so that the new
# column lines up with the rows of blackHC.
blackHC$difference <- c(0, diff(blackHC$hate_crime))
# Quartiles and IQR are both taken from the real differences. The artificial
# leading 0 is left out so it cannot shift the quartiles.
black_changes <- diff(blackHC$hate_crime)
iqr <- IQR(black_changes)
Q <- quantile(black_changes, probs = c(.25, .75), na.rm = FALSE)
#Qtest<-quantile(blackHC$difference)
#Qtest[4] #3rd quartile
high <- Q[2] + 1.5 * iqr
low <- Q[1] - 1.5 * iqr
tsplot(blackHC$difference,main="Detecting Outliers Using IQR Score: Anti-Black Hate Crimes", ylab ="Differenced(hate crime)" )
# Dashed red horizontal lines mark the upper and lower fences.
abline(h = c(high, low), lty = 2, col = "red")

# Exploratory code kept for reference (not run): list and drop flagged rows.
#blackHC[blackHC$difference > high, ]
#blackHC[blackHC$difference < low, ]
#outliers=c(which(blackHC$difference > high), which(blackHC$difference < low)) #12 outliers
#blackHC_no_outliers=blackHC[-outliers,]
#tsplot(blackHC_no_outliers$difference) #getting rid of outliers look stationary. In this case can we exclude outliers?
#tsplot(diff(blackHC_no_outliers$hate_crime)) #why do this plot and the plot above looks different?
#check for the outliers
# Flag unusually large period-to-period changes in the anti-Asian series using
# IQR fences: Q1 - 1.5 * IQR and Q3 + 1.5 * IQR.
# The first row has no predecessor; it is stored as 0 only so that the new
# column lines up with the rows of asianHC.
asianHC$difference <- c(0, diff(asianHC$hate_crime))
# Quartiles and IQR are both taken from the real differences. The artificial
# leading 0 is left out so it cannot shift the quartiles.
asian_changes <- diff(asianHC$hate_crime)
iqr <- IQR(asian_changes)
Q <- quantile(asian_changes, probs = c(.25, .75), na.rm = FALSE)
high <- Q[2] + 1.5 * iqr
low <- Q[1] - 1.5 * iqr
tsplot(asianHC$difference, main="Detecting Outliers Using IQR Score: Anti-Asian Hate Crimes", ylab="Differenced(hate crime)")
# Dashed red horizontal lines mark the upper and lower fences.
abline(h = c(high, low), lty = 2, col = "red")

# Exploratory code kept for reference (not run): list and drop flagged rows.
#asianHC[asianHC$difference > high, ]
#asianHC[asianHC$difference < low, ]
#outliers=c(which(asianHC$difference > high), which(asianHC$difference < low)) #8 outliers
#asianHC_no_outliers=asianHC[-outliers,]
2. Predictive Analysis: stepwise multiple regression
set.seed(123)
# 0. Train/test split. Every row of DF2 is independently labelled 1 (train,
# probability 0.8) or 2 (test, probability 0.2), so the split is only
# approximately 80/20.
indices <- sample(2, size = nrow(DF2), replace = TRUE, prob = c(0.8, 0.2))
is_train_row <- indices == 1
train <- DF2[is_train_row, ]
test <- DF2[!is_train_row, ]
# Scatterplot matrix of all training columns.
plot(train)

#1. all racial hate crime
# Full model: every census predictor, with the two other hate-crime outcome
# columns excluded. It is fitted on `train` only, like the anti-Black and
# anti-Asian models, so the held-out `test` rows stay unseen until the models
# are evaluated on them.
# NOTE: the printed results that follow were produced by an earlier fit on all
# of DF2; re-run the script to refresh them.
mod_all2 <- lm(
  formula = ALL_HC_PER_CAPITA ~ . - BLACK_HC_PER_CAPITA_POP - ASIAN_HC_PER_CAPITA_POP,
  data = train
)
summary(mod_all2)
##
## Call:
## lm(formula = ALL_HC_PER_CAPITA ~ . - BLACK_HC_PER_CAPITA_POP -
## ASIAN_HC_PER_CAPITA_POP, data = DF2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.497 -4.023 -0.849 1.866 31.907
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.671e+02 6.050e+01 -2.761 0.00873 **
## MEDIAN_INCOME 9.784e-04 4.693e-04 2.085 0.04370 *
## POP 2.214e-07 2.333e-07 0.949 0.34848
## WHITE_PER_CAPITA 8.272e-01 3.641e-01 2.272 0.02867 *
## BLACK_PER_CAPITA 4.926e-01 2.925e-01 1.684 0.10014
## ASIAN_PER_CAPITA 1.128e+00 6.568e-01 1.717 0.09383 .
## FOREIGN_BORN_NON_US_RATIO 6.640e-01 8.027e-01 0.827 0.41315
## POVERTY_PERCENT 3.247e+00 1.438e+00 2.258 0.02959 *
## UNEMPLOYMENT_RATE 1.202e-01 1.802e+00 0.067 0.94718
## SCHOOL_ENROLLMENT_RATE -4.540e-01 9.282e-01 -0.489 0.62751
## BACHELOR_RATE 1.538e-01 5.960e-01 0.258 0.79777
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.701 on 39 degrees of freedom
## Multiple R-squared: 0.4624, Adjusted R-squared: 0.3245
## F-statistic: 3.354 on 10 and 39 DF, p-value: 0.003123
# Variance inflation factor of each predictor in the full all-race model.
vif(mod_all2)
## MEDIAN_INCOME POP WHITE_PER_CAPITA
## 17.704275 2.364849 17.888133
## BLACK_PER_CAPITA ASIAN_PER_CAPITA FOREIGN_BORN_NON_US_RATIO
## 6.414012 10.986843 4.365748
## POVERTY_PERCENT UNEMPLOYMENT_RATE SCHOOL_ENROLLMENT_RATE
## 14.419097 3.858263 1.877929
## BACHELOR_RATE
## 4.579832
# Backward stepwise selection starting from the full all-race model;
# trace = 0 suppresses the step-by-step log.
mod_all_back2 <- step(mod_all2, direction = "backward", trace = 0)
# Draw the lm diagnostic plots in a 2 x 2 grid, then restore the previous
# layout so later base-graphics plots are not squeezed into the grid.
old_par <- par(mfrow = c(2, 2))
plot(mod_all_back2)
par(old_par)

summary(mod_all_back2)
##
## Call:
## lm(formula = ALL_HC_PER_CAPITA ~ MEDIAN_INCOME + WHITE_PER_CAPITA +
## BLACK_PER_CAPITA + ASIAN_PER_CAPITA + FOREIGN_BORN_NON_US_RATIO +
## POVERTY_PERCENT, data = DF2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.117 -4.083 -1.195 1.683 30.704
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.996e+02 5.093e+01 -3.919 0.000314 ***
## MEDIAN_INCOME 1.037e-03 3.369e-04 3.077 0.003633 **
## WHITE_PER_CAPITA 1.014e+00 3.012e-01 3.367 0.001610 **
## BLACK_PER_CAPITA 6.411e-01 2.412e-01 2.658 0.011000 *
## ASIAN_PER_CAPITA 1.438e+00 5.553e-01 2.590 0.013043 *
## FOREIGN_BORN_NON_US_RATIO 1.100e+00 5.407e-01 2.034 0.048106 *
## POVERTY_PERCENT 3.541e+00 1.103e+00 3.211 0.002503 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.462 on 43 degrees of freedom
## Multiple R-squared: 0.4435, Adjusted R-squared: 0.3658
## F-statistic: 5.71 on 6 and 43 DF, p-value: 0.0001961
# Variance inflation factor of each predictor kept by the stepwise all-race model.
vif(mod_all_back2)
## MEDIAN_INCOME WHITE_PER_CAPITA BLACK_PER_CAPITA
## 9.714941 13.039657 4.646942
## ASIAN_PER_CAPITA FOREIGN_BORN_NON_US_RATIO POVERTY_PERCENT
## 8.362728 2.109388 9.032896
# 2. Anti-Black hate crime
# Full model on the training rows: every census predictor, with the two other
# hate-crime outcome columns excluded.
mod_black2 <- lm(
  formula = BLACK_HC_PER_CAPITA_POP ~ . - ALL_HC_PER_CAPITA - ASIAN_HC_PER_CAPITA_POP,
  data = train
)
summary(mod_black2)
##
## Call:
## lm(formula = BLACK_HC_PER_CAPITA_POP ~ . - ALL_HC_PER_CAPITA -
## ASIAN_HC_PER_CAPITA_POP, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6105 -1.7486 -0.2017 0.8917 10.3159
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.435e+01 2.551e+01 -2.523 0.0176 *
## MEDIAN_INCOME 3.095e-04 2.224e-04 1.391 0.1750
## POP 7.469e-08 1.229e-07 0.608 0.5483
## WHITE_PER_CAPITA 3.062e-01 1.515e-01 2.021 0.0529 .
## BLACK_PER_CAPITA 1.942e-01 1.445e-01 1.344 0.1897
## ASIAN_PER_CAPITA 1.416e+00 5.645e-01 2.508 0.0182 *
## FOREIGN_BORN_NON_US_RATIO -1.962e-01 3.940e-01 -0.498 0.6224
## POVERTY_PERCENT 1.355e+00 6.672e-01 2.031 0.0518 .
## UNEMPLOYMENT_RATE -3.618e-01 7.536e-01 -0.480 0.6348
## SCHOOL_ENROLLMENT_RATE -1.152e-02 4.136e-01 -0.028 0.9780
## BACHELOR_RATE 9.194e-02 2.876e-01 0.320 0.7516
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.915 on 28 degrees of freedom
## Multiple R-squared: 0.4654, Adjusted R-squared: 0.2745
## F-statistic: 2.438 on 10 and 28 DF, p-value: 0.0309
# Variance inflation factor of each predictor in the full anti-Black model.
vif(mod_black2)
## MEDIAN_INCOME POP WHITE_PER_CAPITA
## 16.501417 2.107509 9.480899
## BLACK_PER_CAPITA ASIAN_PER_CAPITA FOREIGN_BORN_NON_US_RATIO
## 6.996064 6.399358 4.655469
## POVERTY_PERCENT UNEMPLOYMENT_RATE SCHOOL_ENROLLMENT_RATE
## 13.298666 3.688606 2.298976
## BACHELOR_RATE
## 4.804371
# Backward stepwise selection starting from the full anti-Black model;
# trace = 0 suppresses the step-by-step log.
mod_black_back2 <- step(mod_black2, direction = "backward", trace = 0)
# Draw the lm diagnostic plots in a 2 x 2 grid, then restore the previous
# layout so later base-graphics plots are not squeezed into the grid.
old_par <- par(mfrow = c(2, 2))
plot(mod_black_back2)
par(old_par)

summary(mod_black_back2)
##
## Call:
## lm(formula = BLACK_HC_PER_CAPITA_POP ~ MEDIAN_INCOME + WHITE_PER_CAPITA +
## BLACK_PER_CAPITA + ASIAN_PER_CAPITA + POVERTY_PERCENT, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4842 -1.4136 -0.3367 0.9879 10.2929
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.622e+01 2.181e+01 -3.036 0.00466 **
## MEDIAN_INCOME 3.000e-04 1.615e-04 1.858 0.07217 .
## WHITE_PER_CAPITA 3.595e-01 1.209e-01 2.973 0.00547 **
## BLACK_PER_CAPITA 2.502e-01 1.055e-01 2.372 0.02370 *
## ASIAN_PER_CAPITA 1.326e+00 4.040e-01 3.283 0.00243 **
## POVERTY_PERCENT 1.147e+00 5.038e-01 2.277 0.02937 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.723 on 33 degrees of freedom
## Multiple R-squared: 0.4504, Adjusted R-squared: 0.3672
## F-statistic: 5.41 on 5 and 33 DF, p-value: 0.0009665
# Variance inflation factor of each predictor kept by the stepwise anti-Black model.
vif(mod_black_back2)
## MEDIAN_INCOME WHITE_PER_CAPITA BLACK_PER_CAPITA ASIAN_PER_CAPITA
## 9.977125 6.922026 4.275537 3.757292
## POVERTY_PERCENT
## 8.691638
# 3. Anti-Asian hate crime
# Full model on the training rows: every census predictor, with the two other
# hate-crime outcome columns excluded.
mod_asian2 <- lm(
  formula = ASIAN_HC_PER_CAPITA_POP ~ . - ALL_HC_PER_CAPITA - BLACK_HC_PER_CAPITA_POP,
  data = train
)
summary(mod_asian2)
##
## Call:
## lm(formula = ASIAN_HC_PER_CAPITA_POP ~ . - ALL_HC_PER_CAPITA -
## BLACK_HC_PER_CAPITA_POP, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.49454 -0.23939 -0.00395 0.09092 1.71530
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.193e+00 3.788e+00 -1.899 0.0679 .
## MEDIAN_INCOME 4.511e-05 3.303e-05 1.366 0.1829
## POP 7.520e-10 1.825e-08 0.041 0.9674
## WHITE_PER_CAPITA 3.145e-02 2.250e-02 1.398 0.1732
## BLACK_PER_CAPITA 2.311e-02 2.146e-02 1.077 0.2908
## ASIAN_PER_CAPITA 1.434e-01 8.384e-02 1.711 0.0982 .
## FOREIGN_BORN_NON_US_RATIO 1.433e-02 5.851e-02 0.245 0.8083
## POVERTY_PERCENT 1.695e-01 9.909e-02 1.711 0.0981 .
## UNEMPLOYMENT_RATE -7.773e-02 1.119e-01 -0.695 0.4930
## SCHOOL_ENROLLMENT_RATE -2.057e-02 6.143e-02 -0.335 0.7402
## BACHELOR_RATE 8.014e-03 4.271e-02 0.188 0.8525
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4329 on 28 degrees of freedom
## Multiple R-squared: 0.4116, Adjusted R-squared: 0.2014
## F-statistic: 1.959 on 10 and 28 DF, p-value: 0.07867
# Variance inflation factor of each predictor in the full anti-Asian model.
vif(mod_asian2)
## MEDIAN_INCOME POP WHITE_PER_CAPITA
## 16.501417 2.107509 9.480899
## BLACK_PER_CAPITA ASIAN_PER_CAPITA FOREIGN_BORN_NON_US_RATIO
## 6.996064 6.399358 4.655469
## POVERTY_PERCENT UNEMPLOYMENT_RATE SCHOOL_ENROLLMENT_RATE
## 13.298666 3.688606 2.298976
## BACHELOR_RATE
## 4.804371
# Backward stepwise selection starting from the full anti-Asian model;
# trace = 0 suppresses the step-by-step log.
mod_asian_back2 <- step(mod_asian2, direction = "backward", trace = 0)
# Draw the lm diagnostic plots in a 2 x 2 grid, then restore the previous
# layout so later base-graphics plots are not squeezed into the grid.
old_par <- par(mfrow = c(2, 2))
plot(mod_asian_back2)
par(old_par)

summary(mod_asian_back2)
##
## Call:
## lm(formula = ASIAN_HC_PER_CAPITA_POP ~ MEDIAN_INCOME + WHITE_PER_CAPITA +
## BLACK_PER_CAPITA + ASIAN_PER_CAPITA + POVERTY_PERCENT, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46558 -0.25852 -0.00099 0.16308 1.74991
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.555e+00 3.236e+00 -2.334 0.0258 *
## MEDIAN_INCOME 3.961e-05 2.396e-05 1.653 0.1078
## WHITE_PER_CAPITA 3.676e-02 1.794e-02 2.049 0.0485 *
## BLACK_PER_CAPITA 2.598e-02 1.565e-02 1.660 0.1064
## ASIAN_PER_CAPITA 1.556e-01 5.994e-02 2.596 0.0140 *
## POVERTY_PERCENT 1.319e-01 7.474e-02 1.765 0.0869 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4039 on 33 degrees of freedom
## Multiple R-squared: 0.3963, Adjusted R-squared: 0.3048
## F-statistic: 4.332 on 5 and 33 DF, p-value: 0.003856
# Variance inflation factor of each predictor kept by the stepwise anti-Asian model.
vif(mod_asian_back2)
## MEDIAN_INCOME WHITE_PER_CAPITA BLACK_PER_CAPITA ASIAN_PER_CAPITA
## 9.977125 6.922026 4.275537 3.757292
## POVERTY_PERCENT
## 8.691638
3. Predictive Analysis: k nearest neighbors
set.seed(123)
#1. all racial hate crime
# 10-fold cross-validation; this control object is reused by the other k-NN fits.
trctrl <- trainControl(method = "cv", number = 10)
# Candidate k values: odd numbers from 5 up to just under the size of a
# cross-validation training fold (about 90% of the training rows). Larger k
# cannot find more neighbours and only repeats the same degenerate fit.
knn_max_k <- max(5, floor(0.9 * nrow(train)) - 1)
knn_grid <- data.frame(k = seq(5, knn_max_k, by = 2))
# k-NN is distance based. Predictors are centred and scaled so that
# large-magnitude columns such as POP and MEDIAN_INCOME do not swamp the
# percentage-scale predictors when neighbours are chosen.
# NOTE: the printed results that follow were produced without scaling and with
# k up to 103; re-run the script to refresh them.
simple_fit3 <- train(
  ALL_HC_PER_CAPITA ~ . - BLACK_HC_PER_CAPITA_POP - ASIAN_HC_PER_CAPITA_POP,
  data = train,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = trctrl,
  tuneGrid = knn_grid
)
plot(simple_fit3)

simple_fit3
## k-Nearest Neighbors
##
## 39 samples
## 12 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 35, 35, 35, 35, 35, 35, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 6.322412 0.4237717 4.551636
## 7 6.420670 0.4056259 4.601590
## 9 6.607362 0.3613087 4.776129
## 11 6.307716 0.3844472 4.477374
## 13 6.180925 0.4416995 4.267641
## 15 6.152784 0.4458449 4.209575
## 17 6.156900 0.4387926 4.297701
## 19 6.010273 0.4613615 4.230657
## 21 5.953789 0.4649601 4.242796
## 23 5.963167 0.4642294 4.232403
## 25 6.044545 0.4504058 4.313037
## 27 6.479173 0.4084240 4.832036
## 29 6.933877 0.4108113 5.363944
## 31 7.064728 0.4432337 5.492342
## 33 7.050698 0.3983654 5.456585
## 35 7.065644 0.4098173 5.471668
## 37 7.062176 0.4098173 5.475572
## 39 7.062176 0.4098173 5.475572
## 41 7.062176 0.4098173 5.475572
## 43 7.062176 0.4098173 5.475572
## 45 7.062176 0.4098173 5.475572
## 47 7.062176 0.4098173 5.475572
## 49 7.062176 0.4098173 5.475572
## 51 7.062176 0.4098173 5.475572
## 53 7.062176 0.4098173 5.475572
## 55 7.062176 0.4098173 5.475572
## 57 7.062176 0.4098173 5.475572
## 59 7.062176 0.4098173 5.475572
## 61 7.062176 0.4098173 5.475572
## 63 7.062176 0.4098173 5.475572
## 65 7.062176 0.4098173 5.475572
## 67 7.062176 0.4098173 5.475572
## 69 7.062176 0.4098173 5.475572
## 71 7.062176 0.4098173 5.475572
## 73 7.062176 0.4098173 5.475572
## 75 7.062176 0.4098173 5.475572
## 77 7.062176 0.4098173 5.475572
## 79 7.062176 0.4098173 5.475572
## 81 7.062176 0.4098173 5.475572
## 83 7.062176 0.4098173 5.475572
## 85 7.062176 0.4098173 5.475572
## 87 7.062176 0.4098173 5.475572
## 89 7.062176 0.4098173 5.475572
## 91 7.062176 0.4098173 5.475572
## 93 7.062176 0.4098173 5.475572
## 95 7.062176 0.4098173 5.475572
## 97 7.062176 0.4098173 5.475572
## 99 7.062176 0.4098173 5.475572
## 101 7.062176 0.4098173 5.475572
## 103 7.062176 0.4098173 5.475572
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 21.
#2. anti-black hate crime
# Candidate k values: odd numbers from 5 up to just under the size of a
# cross-validation training fold (about 90% of the training rows). Larger k
# cannot find more neighbours and only repeats the same degenerate fit.
knn_max_k <- max(5, floor(0.9 * nrow(train)) - 1)
knn_grid <- data.frame(k = seq(5, knn_max_k, by = 2))
# k-NN is distance based. Predictors are centred and scaled so that
# large-magnitude columns such as POP and MEDIAN_INCOME do not swamp the
# percentage-scale predictors when neighbours are chosen.
# NOTE: the printed results that follow were produced without scaling and with
# k up to 103; re-run the script to refresh them.
simple_fit4 <- train(
  BLACK_HC_PER_CAPITA_POP ~ . - ALL_HC_PER_CAPITA - ASIAN_HC_PER_CAPITA_POP,
  data = train,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = trctrl,
  tuneGrid = knn_grid
)
plot(simple_fit4)

simple_fit4
## k-Nearest Neighbors
##
## 39 samples
## 12 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 35, 35, 35, 35, 35, 35, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 2.644446 0.4748750 1.795815
## 7 2.422895 0.5830589 1.732380
## 9 2.360427 0.6349595 1.722547
## 11 2.253279 0.6551436 1.621244
## 13 2.166296 0.6180502 1.509124
## 15 2.173714 0.5877432 1.528599
## 17 2.151263 0.5910766 1.499530
## 19 2.156769 0.5669521 1.531792
## 21 2.127495 0.5865246 1.534167
## 23 2.136914 0.5696179 1.535365
## 25 2.126744 0.5911237 1.533855
## 27 2.288571 0.5784928 1.675899
## 29 2.458393 0.5898117 1.866354
## 31 2.501090 0.6271986 1.883699
## 33 2.494183 0.5065472 1.875172
## 35 2.503459 0.3695950 1.880769
## 37 2.502700 0.3695950 1.882446
## 39 2.502700 0.3695950 1.882446
## 41 2.502700 0.3695950 1.882446
## 43 2.502700 0.3695950 1.882446
## 45 2.502700 0.3695950 1.882446
## 47 2.502700 0.3695950 1.882446
## 49 2.502700 0.3695950 1.882446
## 51 2.502700 0.3695950 1.882446
## 53 2.502700 0.3695950 1.882446
## 55 2.502700 0.3695950 1.882446
## 57 2.502700 0.3695950 1.882446
## 59 2.502700 0.3695950 1.882446
## 61 2.502700 0.3695950 1.882446
## 63 2.502700 0.3695950 1.882446
## 65 2.502700 0.3695950 1.882446
## 67 2.502700 0.3695950 1.882446
## 69 2.502700 0.3695950 1.882446
## 71 2.502700 0.3695950 1.882446
## 73 2.502700 0.3695950 1.882446
## 75 2.502700 0.3695950 1.882446
## 77 2.502700 0.3695950 1.882446
## 79 2.502700 0.3695950 1.882446
## 81 2.502700 0.3695950 1.882446
## 83 2.502700 0.3695950 1.882446
## 85 2.502700 0.3695950 1.882446
## 87 2.502700 0.3695950 1.882446
## 89 2.502700 0.3695950 1.882446
## 91 2.502700 0.3695950 1.882446
## 93 2.502700 0.3695950 1.882446
## 95 2.502700 0.3695950 1.882446
## 97 2.502700 0.3695950 1.882446
## 99 2.502700 0.3695950 1.882446
## 101 2.502700 0.3695950 1.882446
## 103 2.502700 0.3695950 1.882446
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 25.
#3. anti-asian hate crime
# Candidate k values: odd numbers from 5 up to just under the size of a
# cross-validation training fold (about 90% of the training rows). Larger k
# cannot find more neighbours and only repeats the same degenerate fit.
knn_max_k <- max(5, floor(0.9 * nrow(train)) - 1)
knn_grid <- data.frame(k = seq(5, knn_max_k, by = 2))
# k-NN is distance based. Predictors are centred and scaled so that
# large-magnitude columns such as POP and MEDIAN_INCOME do not swamp the
# percentage-scale predictors when neighbours are chosen.
# NOTE: the printed results that follow were produced without scaling and with
# k up to 103; re-run the script to refresh them.
simple_fit5 <- train(
  ASIAN_HC_PER_CAPITA_POP ~ . - ALL_HC_PER_CAPITA - BLACK_HC_PER_CAPITA_POP,
  data = train,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = trctrl,
  tuneGrid = knn_grid
)
plot(simple_fit5)

simple_fit5
## k-Nearest Neighbors
##
## 39 samples
## 12 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 35, 35, 35, 35, 35, 35, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.3691158 0.5747832 0.2394554
## 7 0.3289058 0.6438099 0.2202018
## 9 0.3028478 0.6967459 0.2024725
## 11 0.2880529 0.6554908 0.2007762
## 13 0.2743234 0.6403317 0.1896897
## 15 0.2696925 0.5931313 0.1883349
## 17 0.2627104 0.5862949 0.1849407
## 19 0.2612330 0.5943251 0.1800576
## 21 0.2542633 0.5985920 0.1741475
## 23 0.2535920 0.5562669 0.1729369
## 25 0.2528757 0.5025716 0.1751082
## 27 0.2786101 0.5570465 0.2011020
## 29 0.2968264 0.5488665 0.2182805
## 31 0.3017404 0.5206127 0.2234350
## 33 0.3006905 0.4938958 0.2219397
## 35 0.3015118 0.4560191 0.2232957
## 37 0.3016995 0.4560191 0.2234949
## 39 0.3016995 0.4560191 0.2234949
## 41 0.3016995 0.4560191 0.2234949
## 43 0.3016995 0.4560191 0.2234949
## 45 0.3016995 0.4560191 0.2234949
## 47 0.3016995 0.4560191 0.2234949
## 49 0.3016995 0.4560191 0.2234949
## 51 0.3016995 0.4560191 0.2234949
## 53 0.3016995 0.4560191 0.2234949
## 55 0.3016995 0.4560191 0.2234949
## 57 0.3016995 0.4560191 0.2234949
## 59 0.3016995 0.4560191 0.2234949
## 61 0.3016995 0.4560191 0.2234949
## 63 0.3016995 0.4560191 0.2234949
## 65 0.3016995 0.4560191 0.2234949
## 67 0.3016995 0.4560191 0.2234949
## 69 0.3016995 0.4560191 0.2234949
## 71 0.3016995 0.4560191 0.2234949
## 73 0.3016995 0.4560191 0.2234949
## 75 0.3016995 0.4560191 0.2234949
## 77 0.3016995 0.4560191 0.2234949
## 79 0.3016995 0.4560191 0.2234949
## 81 0.3016995 0.4560191 0.2234949
## 83 0.3016995 0.4560191 0.2234949
## 85 0.3016995 0.4560191 0.2234949
## 87 0.3016995 0.4560191 0.2234949
## 89 0.3016995 0.4560191 0.2234949
## 91 0.3016995 0.4560191 0.2234949
## 93 0.3016995 0.4560191 0.2234949
## 95 0.3016995 0.4560191 0.2234949
## 97 0.3016995 0.4560191 0.2234949
## 99 0.3016995 0.4560191 0.2234949
## 101 0.3016995 0.4560191 0.2234949
## 103 0.3016995 0.4560191 0.2234949
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 25.
4. Validation using held-out test data
4.1. Calculate RMSE and normalized RMSE
set.seed(123)
# 1. All racial hate crime, backward stepwise regression (started from the
# 10 census predictors): error on the held-out test rows.
test_pred4.2 <- predict(mod_all_back2, newdata = test)
test_pred_df4.2 <- as.data.frame(test_pred4.2)
# Root mean squared error between observed and predicted values.
squared_error_4.2 <- (test$ALL_HC_PER_CAPITA - test_pred_df4.2$test_pred4.2)^2
RMSE_4.2 <- sqrt(mean(squared_error_4.2))
# Normalised RMSE: RMSE divided by the observed range (max - min) in the test rows.
observed_range_4.2 <- max(test$ALL_HC_PER_CAPITA) - min(test$ALL_HC_PER_CAPITA)
N_RMSE_4.2 <- RMSE_4.2 / observed_range_4.2
round(RMSE_4.2, digits = 3)
## [1] 5.95
round(N_RMSE_4.2, digits = 3)
## [1] 0.191
# 2. Anti-Black hate crime, backward stepwise regression (started from the
# 10 census predictors): error on the held-out test rows.
test_pred5.2 <- predict(mod_black_back2, newdata = test)
test_pred_df5.2 <- as.data.frame(test_pred5.2)
# The prediction column is named "test_pred5.2". `[[` requires the exact name,
# whereas `$` with a shortened name only works through partial matching and
# breaks silently if a similarly named column is ever added.
squared_error_5.2 <- (test$BLACK_HC_PER_CAPITA_POP - test_pred_df5.2[["test_pred5.2"]])^2
RMSE_5.2 <- sqrt(mean(squared_error_5.2))
# Normalised RMSE: RMSE divided by the observed range (max - min) in the test rows.
N_RMSE_5.2 <- RMSE_5.2 / (max(test$BLACK_HC_PER_CAPITA_POP) - min(test$BLACK_HC_PER_CAPITA_POP))
round(RMSE_5.2, digits = 3)
## [1] 8.784
round(N_RMSE_5.2, digits = 3)
## [1] 0.715
# 3. Anti-Asian hate crime, backward stepwise regression (started from the
# 10 census predictors): error on the held-out test rows.
test_pred6.2 <- predict(mod_asian_back2, newdata = test)
test_pred_df6.2 <- as.data.frame(test_pred6.2)
# The prediction column is named "test_pred6.2". `[[` requires the exact name,
# whereas `$` with a shortened name only works through partial matching and
# breaks silently if a similarly named column is ever added.
squared_error_6.2 <- (test$ASIAN_HC_PER_CAPITA_POP - test_pred_df6.2[["test_pred6.2"]])^2
RMSE_6.2 <- sqrt(mean(squared_error_6.2))
# Normalised RMSE: RMSE divided by the observed range (max - min) in the test rows.
N_RMSE_6.2 <- RMSE_6.2 / (max(test$ASIAN_HC_PER_CAPITA_POP) - min(test$ASIAN_HC_PER_CAPITA_POP))
round(RMSE_6.2, digits = 3)
## [1] 1.203
round(N_RMSE_6.2, digits = 3)
## [1] 0.599
# 4. All racial hate crime, k-NN: error on the held-out test rows.
test_pred <- predict(simple_fit3, newdata = test)
test_pred_df <- as.data.frame(test_pred)
# Root mean squared error between observed and predicted values.
squared_error_1 <- (test$ALL_HC_PER_CAPITA - test_pred_df$test_pred)^2
RMSE_1 <- sqrt(mean(squared_error_1))
# Normalised RMSE: RMSE divided by the observed range (max - min) in the test rows.
observed_range_1 <- max(test$ALL_HC_PER_CAPITA) - min(test$ALL_HC_PER_CAPITA)
N_RMSE_1 <- RMSE_1 / observed_range_1
round(RMSE_1, digits = 3)
## [1] 8.099
round(N_RMSE_1, digits = 3)
## [1] 0.261
# 5. Anti-Black hate crime, k-NN: error on the held-out test rows.
test_pred2 <- predict(simple_fit4, newdata = test)
test_pred_df2 <- as.data.frame(test_pred2)
# Root mean squared error between observed and predicted values.
squared_error_2 <- (test$BLACK_HC_PER_CAPITA_POP - test_pred_df2$test_pred2)^2
RMSE_2 <- sqrt(mean(squared_error_2))
# Normalised RMSE: RMSE divided by the observed range (max - min) in the test rows.
observed_range_2 <- max(test$BLACK_HC_PER_CAPITA_POP) - min(test$BLACK_HC_PER_CAPITA_POP)
N_RMSE_2 <- RMSE_2 / observed_range_2
round(RMSE_2, digits = 3)
## [1] 3.539
round(N_RMSE_2, digits = 3)
## [1] 0.288
# 6. Anti-Asian hate crime, k-NN: error on the held-out test rows.
test_pred3 <- predict(simple_fit5, newdata = test)
test_pred_df3 <- as.data.frame(test_pred3)
# Root mean squared error between observed and predicted values.
squared_error_3 <- (test$ASIAN_HC_PER_CAPITA_POP - test_pred_df3$test_pred3)^2
RMSE_3 <- sqrt(mean(squared_error_3))
# Normalised RMSE: RMSE divided by the observed range (max - min) in the test rows.
observed_range_3 <- max(test$ASIAN_HC_PER_CAPITA_POP) - min(test$ASIAN_HC_PER_CAPITA_POP)
N_RMSE_3 <- RMSE_3 / observed_range_3
round(RMSE_3, digits = 3)
## [1] 0.56
round(N_RMSE_3, digits = 3)
## [1] 0.279
4.2. Plot predicted (using knn) vs. actual value
# Side-by-side bars of the actual and k-NN predicted all-race values for each
# test row. The state labels are typed in by hand.
# NOTE(review): assumes `test` holds exactly these 11 states in this order - confirm.
all_state_labels <- factor(c("AR", "CA", "DE", "HI", "KS", "MD", "MA", "MS", "NM", "NY", "WY"))
n_all_rows <- length(test$ALL_HC_PER_CAPITA)
t1 <- data.frame(
  hate_crime = test$ALL_HC_PER_CAPITA,
  state = all_state_labels,
  value = rep("actual", n_all_rows)
)
t2 <- data.frame(
  hate_crime = test_pred_df$test_pred,
  state = all_state_labels,
  value = rep("predicted", n_all_rows)
)
total <- rbind(t1, t2)
ggplot(data = total, aes(x = state, y = hate_crime, fill = value)) +
  geom_bar(stat = "identity", color = "black", position = position_dodge()) +
  theme_minimal() +
  scale_fill_manual(values = c(culer[1], rgb(1, 0, 0, .6))) +
  ggtitle("Actual vs. Predicted: All Racial Hate Crime in Test Data") +
  xlab("state") +
  ylab("number of racial hate crime")

# Side-by-side bars of the actual and k-NN predicted anti-Black values for
# each test row. The state labels are typed in by hand.
# NOTE(review): assumes `test` holds exactly these 11 states in this order - confirm.
black_state_labels <- factor(c("AR", "CA", "DE", "HI", "KS", "MD", "MA", "MS", "NM", "NY", "WY"))
# Fail loudly if the split ever yields a different number of test rows, rather
# than letting data.frame() recycle the labels.
stopifnot(length(black_state_labels) == nrow(test))
t3 <- data.frame(
  hate_crime = test$BLACK_HC_PER_CAPITA_POP,
  state = black_state_labels,
  value = "actual"
)
# The prediction column is named "test_pred2". `[[` requires the exact name,
# whereas `$test_pred` only resolves through partial matching.
t4 <- data.frame(
  hate_crime = test_pred_df2[["test_pred2"]],
  state = black_state_labels,
  value = "predicted"
)
total <- rbind(t3, t4)
ggplot(data = total, aes(x = state, y = hate_crime, fill = value)) +
  geom_bar(stat = "identity", color = "black", position = position_dodge()) +
  theme_minimal() +
  scale_fill_manual(values = c(culer[2], rgb(0, 0, 1, .6))) +
  ggtitle("Actual vs. Predicted: Anti-Black Hate Crime in Test Data") +
  xlab("state") +
  ylab("number of racial hate crime")

# Side-by-side bars of the actual and k-NN predicted anti-Asian values for
# each test row. The state labels are typed in by hand.
# NOTE(review): assumes `test` holds exactly these 11 states in this order - confirm.
asian_state_labels <- factor(c("AR", "CA", "DE", "HI", "KS", "MD", "MA", "MS", "NM", "NY", "WY"))
# Fail loudly if the split ever yields a different number of test rows, rather
# than letting data.frame() recycle the labels.
stopifnot(length(asian_state_labels) == nrow(test))
t5 <- data.frame(
  hate_crime = test$ASIAN_HC_PER_CAPITA_POP,
  state = asian_state_labels,
  value = "actual"
)
# The prediction column is named "test_pred3". `[[` requires the exact name,
# whereas `$test_pred` only resolves through partial matching.
t6 <- data.frame(
  hate_crime = test_pred_df3[["test_pred3"]],
  state = asian_state_labels,
  value = "predicted"
)
total <- rbind(t5, t6)
ggplot(data = total, aes(x = state, y = hate_crime, fill = value)) +
  geom_bar(stat = "identity", color = "black", position = position_dodge()) +
  theme_minimal() +
  scale_fill_manual(values = c(rgb(0, 0.8, 0.1, .6), rgb(0, 0.5, 0.2, .6))) +
  ggtitle("Actual vs. Predicted: Anti-Asian Hate Crime in Test Data") +
  xlab("state") +
  ylab("number of racial hate crime")
